# load library
library(tidyverse)
## -- Attaching packages ---------------------------------------------- tidyverse 1.3.0 --
## <U+2713> ggplot2 3.2.1     <U+2713> purrr   0.3.2
## <U+2713> tibble  2.1.3     <U+2713> dplyr   0.8.3
## <U+2713> tidyr   1.0.0     <U+2713> stringr 1.4.0
## <U+2713> readr   1.3.1     <U+2713> forcats 0.4.0
## -- Conflicts ------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(ggplot2) 
library(gridExtra) # use to put graphs together in the same frame
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(scales)    # use to improve colors
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(janitor)  # piping function
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(sf)       # possible geomap
## Linking to GEOS 3.6.1, GDAL 2.2.3, PROJ 4.9.3
library(kableExtra)    # clean table design
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(GGally) #used to display ggpairs
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
library(knitr)
library(treemapify)
library(ggthemes)
library(shiny)
library(vctrs)
library(MultNonParam) # hypothesis testing for median
library(ggforce)
library(cowplot)
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggthemes':
## 
##     theme_map
library(egg)
library(formattable)
## 
## Attaching package: 'formattable'
## The following objects are masked from 'package:scales':
## 
##     comma, percent, scientific
library(inspectdf) # New package employeed for base EDA
#library(DataExplorer)
#library(treemap)
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
library(mapdata)
library(maptools)
## Loading required package: sp
## Checking rgeos availability: TRUE
library(mapproj)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:formattable':
## 
##     area
## The following object is masked from 'package:dplyr':
## 
##     select
library(RgoogleMaps)
library(RColorBrewer)
library(plotGoogleMaps)
## Loading required package: spacetime
## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo
library(shiny)
library(leaflet)

# Install the development version of ggmap from GitHub only when the package
# is not already available, so that re-running the analysis does not contact
# the network (or fail offline) on every run.
# NOTE(review): confirm that "tidyup" is still the intended branch.
if (!requireNamespace("ggmap", quietly = TRUE)) {
  devtools::install_github("dkahle/ggmap", ref = "tidyup")
}
## Skipping install of 'ggmap' from a github remote, the SHA1 (2d756e5e) has not changed since last install.
##   Use `force = TRUE` to force installation
# data import
# Read the crime records from "crime.csv" in the working directory.
# stringsAsFactors = TRUE is spelled out because the factor-based output
# further down (str(), summary(), level counts) needs the text columns to be
# factors, and the read.csv() default changed to FALSE in R 4.0.0.
# NOTE(review): the summaries below show 0 in X/Y/Latitude/Longitude and an
# empty NEIGHBOURHOOD for some records instead of NA - presumably sentinel
# values for withheld locations; they are kept as-is here.
crime <- read.csv("crime.csv", stringsAsFactors = TRUE)
  • Data of crimes in Vancouver (Canada) from 2003 to 2017

  • Data obtained from kaggle.com, originally comes from the Vancouver Open Data Catalogue, extracted on 2017-2018 with 530652 records between 2003-01-01 and 2017-07-13

    • 1.TYPE: Type of Crime
    • 2.YEAR: Year when the reported crime activity occurred
    • 3.MONTH: Month when the reported crime activity occurred
    • 4.DAY: Day when the reported crime activity occurred
    • 5.HOUR: Hour when the reported crime activity occurred
    • 6.MINUTE: Minute when the reported crime activity occurred
    • 7.HUNDRED_BLOCK: Generalized location of the reported crime activity
    • 8.NEIGHBOURHOOD: Neighbourhood where the reported crime activity occurred
    • 9.X: Coordinate values projected in UTM Zone 10
    • 10.Y: Coordinate values projected in UTM Zone 10
    • 11.Latitude: Coordinate values converted to Latitude
    • 12.Longitude: Coordinate values converted to Longitude

1 Graphical Profile of the Data

# Column types present in the data set
show_plot(inspect_types(crime))

# Level frequencies of the categorical columns
show_plot(inspect_cat(crime))

# Summary of the numeric columns
show_plot(inspect_num(crime))

# Correlations between the numeric columns
show_plot(inspect_cor(crime))

#crime %>%
#  plot_intro()

#crime %>% plot_bar()
#crime %>% plot_histogram()
#crime %>% plot_correlation(maxcat = 5L)

2 Base EDA Step 1: Uni-variate non-Graphical EDA

# Preview the first ten records
crime %>% head(n = 10)
##                                 TYPE YEAR MONTH DAY HOUR MINUTE
## 1                        Other Theft 2003     5  12   16     15
## 2                        Other Theft 2003     5   7   15     20
## 3                        Other Theft 2003     4  23   16     40
## 4                        Other Theft 2003     4  20   11     15
## 5                        Other Theft 2003     4  12   17     45
## 6                        Other Theft 2003     3  26   20     45
## 7  Break and Enter Residential/Other 2003     3  10   12      0
## 8                           Mischief 2003     6  28    4     13
## 9                        Other Theft 2003     2  16    9      2
## 10 Break and Enter Residential/Other 2003     7   9   18     15
##        HUNDRED_BLOCK      NEIGHBOURHOOD        X       Y Latitude Longitude
## 1   9XX TERMINAL AVE         Strathcona 493906.5 5457452 49.26980 -123.0838
## 2   9XX TERMINAL AVE         Strathcona 493906.5 5457452 49.26980 -123.0838
## 3   9XX TERMINAL AVE         Strathcona 493906.5 5457452 49.26980 -123.0838
## 4   9XX TERMINAL AVE         Strathcona 493906.5 5457452 49.26980 -123.0838
## 5   9XX TERMINAL AVE         Strathcona 493906.5 5457452 49.26980 -123.0838
## 6   9XX TERMINAL AVE         Strathcona 493906.5 5457452 49.26980 -123.0838
## 7  63XX WILTSHIRE ST         Kerrisdale 489325.6 5452818 49.22805 -123.1466
## 8    40XX W 19TH AVE  Dunbar-Southlands 485903.1 5455884 49.25556 -123.1937
## 9   9XX TERMINAL AVE         Strathcona 493906.5 5457452 49.26980 -123.0838
## 10    18XX E 3RD AVE Grandview-Woodland 495078.2 5457221 49.26773 -123.0677
  • Data looks tidy, ready to start analysis

    • data in each column is of same variable type
# Inspect the structure of the data: one line per column with its type
crime %>% str()
## 'data.frame':    530652 obs. of  12 variables:
##  $ TYPE         : Factor w/ 11 levels "Break and Enter Commercial",..: 6 6 6 6 6 6 2 4 6 2 ...
##  $ YEAR         : int  2003 2003 2003 2003 2003 2003 2003 2003 2003 2003 ...
##  $ MONTH        : int  5 5 4 4 4 3 3 6 2 7 ...
##  $ DAY          : int  12 7 23 20 12 26 10 28 16 9 ...
##  $ HOUR         : int  16 15 16 11 17 20 12 4 9 18 ...
##  $ MINUTE       : int  15 20 40 15 45 45 0 13 2 15 ...
##  $ HUNDRED_BLOCK: Factor w/ 21205 levels ""," / 3888 W 50TH AVE",..: 14372 14372 14372 14372 14372 14372 11454 8420 14372 2284 ...
##  $ NEIGHBOURHOOD: Factor w/ 25 levels "","Arbutus Ridge",..: 21 21 21 21 21 21 9 4 21 6 ...
##  $ X            : num  493907 493907 493907 493907 493907 ...
##  $ Y            : num  5457452 5457452 5457452 5457452 5457452 ...
##  $ Latitude     : num  49.3 49.3 49.3 49.3 49.3 ...
##  $ Longitude    : num  -123 -123 -123 -123 -123 ...
  • 530652 records of 12 variables
  • TYPE, HUNDRED_BLOCK, NEIGHBOURHOOD - Factor variables (Categorical)
  • YEAR, MONTH, DAY, HOUR, MINUTE - Discrete integer variables (Quantitative)
  • X, Y, Latitude, Longitude - Continuous Numerical variables (Quantitative)
# Descriptive statistics for every column
summary(crime)
##                                 TYPE             YEAR          MONTH       
##  Theft from Vehicle               :172700   Min.   :2003   Min.   : 1.000  
##  Mischief                         : 70413   1st Qu.:2005   1st Qu.: 4.000  
##  Break and Enter Residential/Other: 60862   Median :2009   Median : 6.000  
##  Offence Against a Person         : 54142   Mean   :2009   Mean   : 6.451  
##  Other Theft                      : 52167   3rd Qu.:2013   3rd Qu.: 9.000  
##  Theft of Vehicle                 : 38418   Max.   :2017   Max.   :12.000  
##  (Other)                          : 81950                                  
##       DAY             HOUR           MINUTE     
##  Min.   : 1.00   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.: 8.00   1st Qu.: 9.00   1st Qu.: 0.00  
##  Median :15.00   Median :15.00   Median :10.00  
##  Mean   :15.41   Mean   :13.71   Mean   :16.94  
##  3rd Qu.:23.00   3rd Qu.:19.00   3rd Qu.:30.00  
##  Max.   :31.00   Max.   :23.00   Max.   :59.00  
##                  NA's   :54362   NA's   :54362  
##                    HUNDRED_BLOCK                      NEIGHBOURHOOD   
##  OFFSET TO PROTECT PRIVACY: 54362   Central Business District:110947  
##  7XX GRANVILLE ST         :  4629                            : 56624  
##  6XX GRANVILLE ST         :  3151   West End                 : 41352  
##  7XX W GEORGIA ST         :  2427   Fairview                 : 32161  
##  X NK_LOC ST              :  2244   Mount Pleasant           : 30536  
##  6XX W 41ST AVE           :  1900   Grandview-Woodland       : 27180  
##  (Other)                  :461939   (Other)                  :231852  
##        X                Y              Latitude       Longitude     
##  Min.   :     0   Min.   :      0   Min.   : 0.00   Min.   :-124.5  
##  1st Qu.:489945   1st Qu.:5453651   1st Qu.:49.24   1st Qu.:-123.1  
##  Median :491499   Median :5456840   Median :49.26   Median :-123.1  
##  Mean   :441802   Mean   :4897663   Mean   :44.22   Mean   :-110.5  
##  3rd Qu.:493547   3rd Qu.:5458638   3rd Qu.:49.28   3rd Qu.:-123.1  
##  Max.   :511303   Max.   :5512579   Max.   :49.76   Max.   :   0.0  
## 
  • Data observations

    • YEAR and MINUTE appear to be right-skewed, so they are not symmetric
    • HOUR seems to be left-skewed, so it is not symmetric
      • this means I will use the median instead of the mean
    • The most frequent type of crime is “Theft from vehicle”
    • “OFFSET TO PROTECT PRIVACY” tops in the list, which will be treated as unknown in my analysis. Among the known locations of reported crime activity, “7xx GRANVILLE ST” takes the first place in the list
    • “Central Business District” has the highest crime incidents
    • 54362 records are missing for location and time information
  • Potential research questions / issues

    • When does crime occur most frequently? (Year and time)
    • What areas are the most unsafe? Are the unsafe areas contiguous or segmented?
    • Do certain neighbourhoods have a high rate of particular types of crime? If so, are there any underlying causes?
    • Is there any difference in the crime rate between different seasons or between holiday and non-holiday periods?

3 Base EDA Step 2: Uni-variate graphical EDA

Make a graph and examine for each variable individually

# YEAR: line through the number of records counted for each year
crime %>%
  ggplot(mapping = aes(x = YEAR)) +
  geom_line(stat = "count") +
  labs(
    title = "Crime Incidents by Year",
    x = "Year",
    y = "Number of Crimes"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )

Comments

- Number of crimes decreases until 2011
- Starts and continues to increase from 2014 (2017 has only records up to July 13th)
# MONTH: line through the number of records counted for each month
crime %>%
  ggplot(mapping = aes(x = MONTH)) +
  geom_line(stat = "count") +
  labs(
    title = "Crime Incidents by Month",
    x = "Month",
    y = "Number of Crimes"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )

Comments

- Number of crimes relatively evenly distributed across months
- February has slightly lower crime incidents
- Slight decline from Oct to Dec
# DAY: line through the number of records counted for each day of the month
crime %>%
  ggplot(mapping = aes(x = DAY)) +
  geom_line(stat = "count") +
  labs(
    title = "Crime Incidents by Day",
    x = "Day",
    y = "Number of Crimes"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )

Comments

- Crime rate is well distributed across days
- 1st and 15th day of month have higher crime incidents 
- The rate slightly decreases toward end of month 
- The count for the 31st is about half that of other days because only seven months in a year have 31 days.
# HOUR: line through the number of records counted for each hour of the day
# 54362 records have no hour (NA); ggplot drops them with a warning
crime %>%
  ggplot(mapping = aes(x = HOUR)) +
  geom_line(stat = "count") +
  labs(
    title = "Crime Incidents by Hour",
    x = "Hour",
    y = "Number of Crimes"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )
## Warning: Removed 54362 rows containing non-finite values (stat_count).

Comments

- Three significant peaks at 0, 12, and 18 hours
- Substantial decline from 0 to 1 hour and continual decrease between 1 - 5 hour
- Rate growth until 18 hour (max peak)

Questions

- Contrary to the common perception of when crime happens, daytime has a high crime incident rate. In contrast, the rate from after midnight until 6am is the lowest of the day. What are potential causes for this?
- Do benign and severe crimes share the same hourly trend?
# MINUTE: line through the number of records counted for each minute value
# 54362 records have no minute (NA, same rows as hour); ggplot drops them
# with a warning.
# The x-axis label is written "Minute" to match the capitalisation used by
# the Year/Month/Day/Hour plots.
crime %>%
  ggplot(mapping = aes(x = MINUTE)) +
  geom_line(stat = "count") +
  labs(
    title = "Crime Incidents by Minute",
    x = "Minute",
    y = "Number of Crimes"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )
## Warning: Removed 54362 rows containing non-finite values (stat_count).

Comments

- While counts are otherwise evenly distributed, minutes 0 and 30 have remarkably high counts
- There are also relatively higher counts at the quarter hours (minutes 15 and 45)
  • 7 Categorical Variables - TYPE, NEIGHBOURHOOD, X, Y, HUNDRED_BLOCK, Latitude, Longitude
# TYPE - horizontal bars, one per crime type, sized by number of records
ggplot(data = crime, mapping = aes(x = TYPE)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Crime Incidents by Type",
    x = "Type of Crime",
    y = "Number of Crimes"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )

Comments

- The most frequent type of crime is "Theft from Vehicle"
- Very few incidents of "Vehicle Collision or Pedestrian Struck (with Fatality)" and "Homicide"

Questions

- Split into groups of benign and severe crime? Would it give a different look into the data?
# NEIGHBOURHOOD - horizontal bars, one per neighbourhood
# Records without a neighbourhood are stored as an empty string, which would
# otherwise be drawn as a bar with no label; give them an explicit label so
# the bar cannot be mistaken for an adjacent neighbourhood.
crime %>%
  mutate(
    NEIGHBOURHOOD = ifelse(
      NEIGHBOURHOOD == "",
      "(Unknown)",
      as.character(NEIGHBOURHOOD)
    )
  ) %>%
  ggplot(mapping = aes(x = NEIGHBOURHOOD)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Crime Incidents by Neighbourhood",
    x = "Neighbourhood",
    y = "Number of Crimes"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )

Comments

- Central Business District and West End have notably higher numbers of crimes; the second-largest bar belongs to records with no neighbourhood recorded (56,624 — the blank category), not to Arbutus Ridge
- Musqueam has a very low number of crimes
- Counts for the other neighbourhoods fall within a comparatively narrow range
  • X/Y, HUNDRED_BLOCK and Latitude/Longitude are not suitable to plot graphs for analysis (Too many different values for x). Therefore, I am presenting non-graphical analysis with count and summary statistics
# Pull each location column out of the data frame as a plain vector
crime_x <- crime[["X"]]
crime_y <- crime[["Y"]]
crime_h <- crime[["HUNDRED_BLOCK"]]
crime_Lt <- crime[["Latitude"]]
crime_Lg <- crime[["Longitude"]]

# Number of records per X value, most frequent first
crime %>% count(X, sort = TRUE)
## # A tibble: 95,502 x 2
##          X     n
##      <dbl> <int>
##  1      0  54362
##  2 491446.  2502
##  3 492757.  2191
##  4 491399.  1738
##  5 497309.  1629
##  6 495037.  1457
##  7 491295.  1252
##  8 492934.  1232
##  9 491408.  1101
## 10 491401.  1095
## # … with 95,492 more rows
# Number of records per Y value, most frequent first
crime %>% count(Y, sort = TRUE)
## # A tibble: 93,544 x 2
##           Y     n
##       <dbl> <int>
##  1       0  54362
##  2 5458896.  2502
##  3 5458792.  2191
##  4 5458862.  1738
##  5 5456156.  1654
##  6 5456614.  1457
##  7 5458745.  1252
##  8 5456669.  1232
##  9 5458872.  1101
## 10 5458865.  1095
## # … with 93,534 more rows
# Number of records per hundred-block, most frequent first
crime %>% count(HUNDRED_BLOCK, sort = TRUE)
## # A tibble: 21,205 x 2
##    HUNDRED_BLOCK                 n
##    <fct>                     <int>
##  1 OFFSET TO PROTECT PRIVACY 54362
##  2 7XX GRANVILLE ST           4629
##  3 6XX GRANVILLE ST           3151
##  4 7XX W GEORGIA ST           2427
##  5 X NK_LOC ST                2244
##  6 6XX W 41ST AVE             1900
##  7 3XX ABBOTT ST              1795
##  8 31XX GRANDVIEW HWY         1784
##  9 11XX ROBSON ST             1758
## 10 17XX E BROADWAY AVE        1718
## # … with 21,195 more rows
# Number of records per Latitude value, most frequent first
crime %>% count(Latitude, sort = TRUE)
## # A tibble: 101,885 x 2
##    Latitude     n
##       <dbl> <int>
##  1      0   54362
##  2     49.3  2502
##  3     49.3  2191
##  4     49.3  1738
##  5     49.3  1629
##  6     49.3  1457
##  7     49.3  1252
##  8     49.3  1232
##  9     49.3  1101
## 10     49.3  1095
## # … with 101,875 more rows
# Number of records per Longitude value, most frequent first
crime %>% count(Longitude, sort = TRUE)
## # A tibble: 98,972 x 2
##    Longitude     n
##        <dbl> <int>
##  1        0  54362
##  2     -123.  2502
##  3     -123.  2191
##  4     -123.  1738
##  5     -123.  1629
##  6     -123.  1459
##  7     -123.  1252
##  8     -123.  1232
##  9     -123.  1101
## 10     -123.  1095
## # … with 98,962 more rows
# Five-number summary and mean of X
crime_x %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0  489945  491499  441802  493547  511303
# Five-number summary and mean of Y
crime_y %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0 5453651 5456840 4897663 5458638 5512579
# Record counts for the most frequent hundred-block levels
crime_h %>% summary()
## OFFSET TO PROTECT PRIVACY          7XX GRANVILLE ST          6XX GRANVILLE ST 
##                     54362                      4629                      3151 
##          7XX W GEORGIA ST               X NK_LOC ST            6XX W 41ST AVE 
##                      2427                      2244                      1900 
##             3XX ABBOTT ST        31XX GRANDVIEW HWY            11XX ROBSON ST 
##                      1795                      1784                      1758 
##       17XX E BROADWAY AVE           5XX RICHARDS ST        3XX E BROADWAY AVE 
##                      1718                      1462                      1437 
##         5XX W HASTINGS ST            23XX CAMBIE ST              1XX WATER ST 
##                      1424                      1306                      1249 
##           10XX BURRARD ST          3XX SE MARINE DR           7XX DUNSMUIR ST 
##                      1162                      1139                      1113 
##            9XX SEYMOUR ST         34XX KINGSWAY AVE            8XX AVISON WAY 
##                      1101                      1071                      1048 
##          1XX W GEORGIA ST             11XX DAVIE ST            8X W PENDER ST 
##                      1021                      1012                      1004 
##             16XX DAVIE ST           6XX RICHARDS ST         12XX GRANVILLE ST 
##                       979                       973                       953 
##            9XX BURRARD ST            5XX SEYMOUR ST          7XX PACIFIC BLVD 
##                       946                       944                       918 
##            10XX ROBSON ST         33XX KINGSWAY AVE        11XX W HASTINGS ST 
##                       915                       864                       844 
##        15XX COMMERCIAL DR            10XX BEACH AVE        25XX E HASTINGS ST 
##                       833                       821                       819 
##              11XX HOWE ST        18XX E HASTINGS ST           1XX W PENDER ST 
##                       795                       776                       771 
##          14XX ANDERSON ST            8XX BURRARD ST         1XX E HASTINGS ST 
##                       756                       740                       732 
##            12XX ROBSON ST            4XX SEYMOUR ST          5XX GRANVILLE ST 
##                       719                       716                       715 
##             5XX ABBOTT ST          3XX W GEORGIA ST            14XX QUEBEC ST 
##                       695                       673                       664 
##             1XX KEEFER ST           11XX ALBERNI ST            14XX ROBSON ST 
##                       652                       648                       645 
##         11XX GRANVILLE ST         10XX GRANVILLE ST        35XX GRANDVIEW HWY 
##                       637                       635                       627 
##          2XX E GEORGIA ST          12XX RICHARDS ST            6XX SEYMOUR ST 
##                       624                       619                       618 
##             7XX ROBSON ST            17XX ROBSON ST          9XX TERMINAL AVE 
##                       618                       615                       610 
##            7XX BURRARD ST               7XX BUTE ST        5XX W BROADWAY AVE 
##                       601                       596                       592 
##          9XX GRANVILLE ST           10XX SEYMOUR ST        8XX W BROADWAY AVE 
##                       586                       585                       583 
##          56XX VICTORIA DR            8XX SEYMOUR ST        6XX W BROADWAY AVE 
##                       581                       570                       562 
##          8XX GRANVILLE ST        1XX W BROADWAY AVE         10XX W GEORGIA ST 
##                       562                       555                       554 
##           8XX HAMILTON ST            5XX W 12TH AVE          9XX W CORDOVA ST 
##                       546                       541                       538 
##             5XX HORNBY ST              3XX WATER ST             5XX BEATTY ST 
##                       530                       525                       510 
##           10XX ALBERNI ST             10XX HOMER ST      56XX STANLEY PARK DR 
##                       509                       508                       492 
##          1XX W CORDOVA ST              10XX HOWE ST            7XX SEYMOUR ST 
##                       490                       482                       481 
##            8XX W 12TH AVE            13XX ROBSON ST            55XX CAMBIE ST 
##                       471                       470                       469 
##           1XX E PENDER ST       BUTE ST / ROBSON ST             12XX DAVIE ST 
##                       467                       463                       462 
##  SEYMOUR ST / W PENDER ST            12XX HORNBY ST          16XX JOHNSTON ST 
##                       462                       461                       459 
##          11XX MELVILLE ST           11XX BURRARD ST             5XX W 8TH AVE 
##                       453                       452                       451 
##          1XX E CORDOVA ST              8XX HOMER ST           11XX HARWOOD ST 
##                       449                       444                       438 
##                   (Other) 
##                    391276
# Five-number summary and mean of Latitude
crime_Lt %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   49.24   49.26   44.22   49.28   49.76
# Five-number summary and mean of Longitude
crime_Lg %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -124.5  -123.1  -123.1  -110.5  -123.1     0.0

Comments

- 54362 incidents are missing all location information
- X/Y and Latitude/Longitude share the same list of crime counts

4 Base EDA Step 3: Multi-variate non-graphical EDA

4.1 Quantitative / Numerical Variables

Do all crime types share the same trend through time?

# Cross-tabulate YEAR against crime TYPE, append row and column totals,
# and render the result as a table
kable(
  adorn_totals(
    tabyl(crime, YEAR, TYPE),
    where = c("row", "col")
  )
)
YEAR Break and Enter Commercial Break and Enter Residential/Other Homicide Mischief Offence Against a Person Other Theft Theft from Vehicle Theft of Bicycle Theft of Vehicle Vehicle Collision or Pedestrian Struck (with Fatality) Vehicle Collision or Pedestrian Struck (with Injury) Total
2003 3197 6883 18 6391 3507 2582 17744 1418 6361 24 1803 49928
2004 3283 6538 22 5601 3804 2605 18204 1230 6102 22 1868 49279
2005 2639 5542 22 5062 3771 2611 16554 1416 5031 32 1984 44664
2006 2844 5674 17 5184 4350 2966 14734 1467 3682 20 1384 42322
2007 2436 4996 19 4810 4412 3024 12226 1203 3305 24 1237 37692
2008 2224 4432 18 5276 4226 3142 11298 1176 2420 17 1185 35414
2009 1858 3497 18 4430 3885 3662 10007 1641 1882 14 1278 32172
2010 1656 3270 10 4506 3731 3432 8612 1667 1467 10 1327 29688
2011 1749 3231 15 4828 3870 3562 7435 1517 1093 11 1262 28573
2012 1687 3311 8 4243 3786 3630 8097 1817 1151 18 1474 29222
2013 1774 3025 7 4191 3663 3488 8340 2034 1034 15 1485 29056
2014 2244 3044 9 4518 3158 4210 10137 2461 1290 13 1575 32659
2015 2457 3121 15 4193 3202 4679 10544 3063 1371 14 1669 34328
2016 2686 2994 11 4599 3172 5708 12806 2634 1474 15 1699 37798
2017 1111 1304 11 2581 1605 2866 5962 986 755 5 671 17857
Total 33845 60862 220 70413 54142 52167 172700 25730 38418 254 21901 530652
# Cross-tabulate MONTH against crime TYPE, append row and column totals,
# and render the result as a table
kable(
  adorn_totals(
    tabyl(crime, MONTH, TYPE),
    where = c("row", "col")
  )
)
MONTH Break and Enter Commercial Break and Enter Residential/Other Homicide Mischief Offence Against a Person Other Theft Theft from Vehicle Theft of Bicycle Theft of Vehicle Vehicle Collision or Pedestrian Struck (with Fatality) Vehicle Collision or Pedestrian Struck (with Injury) Total
1 3059 5783 17 5711 4794 4619 14943 993 3499 29 1771 45218
2 2712 4723 23 5378 4121 4615 13171 1034 3038 17 1600 40432
3 3124 5042 26 6278 4562 4966 14670 1370 3467 22 1787 45314
4 2838 4963 19 6245 4353 4604 14389 1805 3099 19 1704 44038
5 2889 5032 22 6431 4823 4695 15296 2609 3206 19 1861 46883
6 3010 4968 13 6541 4628 4561 14637 3159 3294 23 1877 46711
7 2693 4794 18 6076 4766 4238 14236 3857 3156 25 1820 45679
8 2682 4828 25 5810 4847 4113 14638 3732 3298 21 1952 45946
9 2683 5021 17 5469 4325 3999 14404 2793 3150 15 1874 43750
10 2811 5298 11 6009 4530 3933 14468 2229 3338 21 1918 44566
11 2655 5269 17 5310 4216 3939 14282 1287 3018 25 1928 41946
12 2689 5141 12 5155 4177 3885 13566 862 2855 18 1809 40169
Total 33845 60862 220 70413 54142 52167 172700 25730 38418 254 21901 530652
# Cross-tabulate DAY of month against crime TYPE, append row and column
# totals, and render the result as a table
kable(
  adorn_totals(
    tabyl(crime, DAY, TYPE),
    where = c("row", "col")
  )
)
DAY Break and Enter Commercial Break and Enter Residential/Other Homicide Mischief Offence Against a Person Other Theft Theft from Vehicle Theft of Bicycle Theft of Vehicle Vehicle Collision or Pedestrian Struck (with Fatality) Vehicle Collision or Pedestrian Struck (with Injury) Total
1 1175 2117 12 2802 2581 1567 6128 1054 1330 9 737 19512
2 1149 1989 9 2248 1796 1668 5768 825 1279 8 694 17433
3 1120 2077 7 2259 1778 1758 5778 818 1243 9 670 17517
4 1082 2059 8 2221 1751 1740 5592 913 1308 7 708 17389
5 1144 2035 4 2324 1774 1835 5805 854 1299 6 718 17798
6 1143 1988 10 2222 1727 1905 5922 831 1234 7 736 17725
7 1170 2135 7 2310 1715 1799 6038 917 1321 11 763 18186
8 1153 2125 10 2341 1789 1883 5875 893 1286 12 682 18049
9 1126 2115 9 2263 1690 1809 5879 798 1324 9 694 17716
10 1140 2120 7 2280 1692 1834 5908 873 1329 10 679 17872
11 1234 2039 15 2316 1763 1936 6012 888 1290 10 668 18171
12 1209 2113 5 2328 1735 1951 6028 909 1353 2 755 18388
13 1128 2150 5 2288 1776 1806 6056 905 1312 7 696 18129
14 1144 2151 8 2309 1771 1888 5968 861 1309 5 751 18165
15 1306 2119 10 2783 1796 1868 6260 1008 1356 8 784 19298
16 1141 2102 8 2235 1826 1907 5947 864 1285 11 750 18076
17 1120 1995 7 2314 1726 1849 5976 876 1254 11 735 17863
18 1072 2017 4 2274 1736 1798 5681 889 1260 4 741 17476
19 1082 1992 5 2207 1699 1775 5674 835 1180 7 729 17185
20 1131 2032 4 2365 1685 1718 5802 854 1196 8 753 17548
21 1098 1958 1 2232 1727 1622 5340 910 1214 9 679 16790
22 1044 1891 1 2273 1781 1615 5376 820 1195 5 734 16735
23 1068 1905 6 2283 1651 1588 5211 737 1191 6 753 16399
24 1087 1809 8 2315 1746 1425 5210 744 1227 7 718 16296
25 1005 1744 5 2182 1819 1394 4933 738 1163 7 666 15656
26 949 1752 7 2270 1707 1496 5121 700 1158 10 703 15873
27 1022 1759 10 2167 1682 1459 5115 725 1190 8 735 15872
28 992 1835 8 2212 1808 1517 5124 796 1227 17 727 16263
29 946 1779 9 2156 1689 1407 4860 775 1181 11 656 15469
30 972 1780 8 2075 1644 1476 5008 691 1217 6 672 15549
31 693 1180 3 1559 1082 874 3305 429 707 7 415 10254
Total 33845 60862 220 70413 54142 52167 172700 25730 38418 254 21901 530652
# Cross-tabulate HOUR against crime TYPE, append row and column totals,
# and render the result as a table (records without an hour form an NA row)
kable(
  adorn_totals(
    tabyl(crime, HOUR, TYPE),
    where = c("row", "col")
  )
)
HOUR Break and Enter Commercial Break and Enter Residential/Other Homicide Mischief Offence Against a Person Other Theft Theft from Vehicle Theft of Bicycle Theft of Vehicle Vehicle Collision or Pedestrian Struck (with Fatality) Vehicle Collision or Pedestrian Struck (with Injury) Total
0 2326 3049 0 5616 0 369 11328 1687 2316 9 477 27177
1 1635 1345 0 3390 0 285 4012 500 1093 12 449 12721
2 1808 1188 0 2819 0 223 2786 314 813 6 380 10337
3 2267 1099 0 2189 0 192 1976 205 499 11 319 8757
4 2589 1026 0 1636 0 156 1674 189 408 10 224 7912
5 2383 1037 0 1336 0 179 1704 204 348 8 263 7462
6 1489 1397 0 1283 0 225 2336 320 538 3 510 8101
7 990 2220 0 1531 0 368 3568 571 729 6 819 10802
8 822 3730 0 2157 0 623 5296 1095 1056 6 1149 15934
9 771 3247 0 2205 0 1355 5648 1121 1073 10 1046 16476
10 690 3049 0 2029 0 2420 4867 916 959 11 1050 15991
11 545 2836 0 1965 0 3136 4755 866 902 9 1041 16055
12 689 3872 0 2848 0 3646 8224 1651 1553 17 1113 23613
13 476 2757 0 2048 0 4253 5765 1154 991 10 1069 18523
14 554 2946 0 2249 0 4824 6418 1297 1258 17 1231 20794
15 783 3037 0 2689 0 5229 7596 1529 1464 19 1550 23896
16 1142 2810 0 3092 0 5148 8984 1581 1775 8 1545 26085
17 1955 3209 0 3814 0 4947 11586 1822 2456 16 1816 31621
18 2344 3640 0 4455 0 4231 14942 2060 3031 13 1479 36195
19 1420 2991 0 3902 0 3581 12161 1634 2802 14 1085 29590
20 1327 2657 0 3974 0 2857 11645 1464 2879 9 895 27707
21 1427 2587 0 4247 0 1974 11721 1294 3202 6 903 27361
22 1640 2650 0 4626 0 1282 13106 1285 3361 14 827 28791
23 1773 2483 0 4313 0 664 10602 971 2912 10 661 24389
NA 0 0 220 0 54142 0 0 0 0 0 0 54362
Total 33845 60862 220 70413 54142 52167 172700 25730 38418 254 21901 530652
# Cross-tabulate MINUTE against crime TYPE, append row and column totals,
# and render the result as a table (records without a minute form an NA row)
kable(
  adorn_totals(
    tabyl(crime, MINUTE, TYPE),
    where = c("row", "col")
  )
)
MINUTE Break and Enter Commercial Break and Enter Residential/Other Homicide Mischief Offence Against a Person Other Theft Theft from Vehicle Theft of Bicycle Theft of Vehicle Vehicle Collision or Pedestrian Struck (with Fatality) Vehicle Collision or Pedestrian Struck (with Injury) Total
0 11602 27328 0 25813 0 3259 101683 14440 22242 7 965 207339
1 468 704 0 746 0 686 1093 137 246 2 309 4391
2 223 244 0 535 0 736 287 66 54 3 309 2457
3 215 275 0 470 0 678 334 61 59 3 304 2399
4 224 237 0 469 0 676 305 61 57 2 325 2356
5 460 514 0 812 0 1062 769 158 205 7 444 4431
6 222 257 0 489 0 678 313 67 60 2 304 2392
7 232 229 0 518 0 685 336 66 52 2 332 2452
8 250 272 0 529 0 754 348 60 58 3 334 2608
9 241 227 0 525 0 704 321 72 56 4 282 2432
10 593 871 0 1028 0 1221 1468 304 327 10 517 6339
11 227 241 0 517 0 686 297 51 56 1 334 2410
12 262 249 0 526 0 674 348 75 43 5 280 2462
13 242 270 0 465 0 681 292 65 54 3 339 2411
14 247 240 0 520 0 711 311 56 52 1 312 2450
15 1031 1920 0 1877 0 1544 4717 807 1022 10 510 13438
16 251 252 0 485 0 695 286 52 64 0 307 2392
17 238 257 0 490 0 663 350 70 59 3 306 2436
18 238 269 0 538 0 744 321 61 36 4 320 2531
19 207 254 0 490 0 705 288 66 49 3 307 2369
20 734 1036 0 1259 0 1371 1952 376 430 9 569 7736
21 215 256 0 484 0 698 328 54 52 3 284 2374
22 240 249 0 483 0 685 310 60 59 2 300 2388
23 235 277 0 511 0 675 304 68 68 1 288 2427
24 237 272 0 533 0 681 370 71 60 3 316 2543
25 434 529 0 845 0 1093 765 169 156 12 444 4447
26 220 239 0 503 0 714 314 62 62 5 295 2414
27 228 266 0 507 0 702 328 52 74 6 322 2485
28 257 249 0 518 0 714 313 61 56 4 323 2495
29 221 255 0 474 0 668 320 42 60 5 312 2357
30 4258 10926 0 8387 0 2612 34494 4536 8584 14 835 74646
31 231 262 0 492 0 680 322 48 60 4 303 2402
32 207 250 0 477 0 689 309 49 57 4 307 2349
33 207 258 0 501 0 744 345 74 61 0 327 2517
34 230 264 0 522 0 705 326 59 71 7 278 2462
35 418 450 0 800 0 1061 692 153 166 8 471 4219
36 240 259 0 532 0 728 295 50 63 2 348 2517
37 216 242 0 525 0 718 321 56 68 4 322 2472
38 249 238 0 515 0 703 315 60 64 3 308 2455
39 236 259 0 516 0 709 281 51 67 4 318 2441
40 694 971 0 1175 0 1373 1611 306 364 4 531 7029
41 234 264 0 498 0 666 303 65 56 6 303 2395
42 233 241 0 500 0 762 334 47 62 3 330 2512
43 257 255 0 451 0 702 312 51 52 3 345 2428
44 237 258 0 501 0 665 306 59 56 4 321 2407
45 1020 2210 0 1991 0 1761 5586 956 1244 8 551 15327
46 218 224 0 487 0 651 331 53 39 1 304 2308
47 249 269 0 519 0 708 343 57 66 2 328 2541
48 240 255 0 508 0 736 352 68 68 8 302 2537
49 205 249 0 503 0 633 309 48 59 3 325 2334
50 736 963 0 1158 0 1398 1855 342 415 8 575 7450
51 188 225 0 474 0 722 293 67 51 5 309 2334
52 232 257 0 491 0 697 312 76 58 3 306 2432
53 234 239 0 524 0 725 333 70 56 5 338 2524
54 248 241 0 526 0 666 315 74 56 3 304 2433
55 424 553 0 786 0 1077 843 184 210 7 462 4546
56 231 248 0 546 0 685 299 51 54 3 320 2437
57 257 249 0 533 0 655 298 60 56 3 309 2420
58 247 227 0 476 0 675 321 64 55 0 304 2369
59 275 318 0 540 0 618 573 86 152 0 324 2886
NA 0 0 220 0 54142 0 0 0 0 0 0 54362
Total 33845 60862 220 70413 54142 52167 172700 25730 38418 254 21901 530652

Comments

  • TYPE by Year - All crime types show a steady decline until 2010, remain at a low level between 2010 and 2013, and increase again after 2014

    • TYPE by Month
      • Slightly higher number of incidents across all crime types in summer
      • Crime rate drops in all crime types in December and substantial increase in January (Holiday season effect?)
    • TYPE by DAY
      • The last day of the month has relatively fewer incidents than the first day of the month for all types of crime
    • TYPE by HOUR
      • Break and Enter Commercial has high number of incidents during late night and early morning
      • Incidents of almost all crime types increase around hour 18
      • N/A for Homicide and Offence Against a Person
    • TYPE by MINUTE
      • High numbers of crimes at every quarter hour, while minutes 0 and 30 have significant peaks. This suggests that a large portion of crimes are planned/organized

## Categorical / Factor Variables
# Cross-tabulate NEIGHBOURHOOD (rows) against crime TYPE (columns), append
# row and column totals, and render the result as a table
kable(
  adorn_totals(
    tabyl(crime, NEIGHBOURHOOD, TYPE),
    where = c("row", "col")
  )
)
NEIGHBOURHOOD Break and Enter Commercial Break and Enter Residential/Other Homicide Mischief Offence Against a Person Other Theft Theft from Vehicle Theft of Bicycle Theft of Vehicle Vehicle Collision or Pedestrian Struck (with Fatality) Vehicle Collision or Pedestrian Struck (with Injury) Total
3 3 220 253 54142 4 1809 110 66 0 14 56624
Arbutus Ridge 325 1672 0 934 0 337 1852 160 498 3 285 6066
Central Business District 9371 3505 0 16672 0 19244 48003 6907 4016 41 3188 110947
Dunbar-Southlands 294 1847 0 1324 0 241 2899 240 629 3 269 7746
Fairview 3303 3834 0 3196 0 3269 11934 3394 2037 14 1180 32161
Grandview-Woodland 2082 4515 0 4970 0 2508 7342 1403 3111 9 1240 27180
Hastings-Sunrise 929 3199 0 2904 0 1379 5654 321 2452 18 1270 18126
Kensington-Cedar Cottage 1277 4136 0 3760 0 2961 7474 859 2919 11 1544 24941
Kerrisdale 326 1826 0 1049 0 265 2805 179 547 7 443 7447
Killarney 302 2130 0 1761 0 245 3990 163 1302 12 570 10475
Kitsilano 2092 4390 0 3692 0 1730 8912 2464 2366 13 1040 26699
Marpole 1098 2527 0 1905 0 612 4151 232 1617 10 931 13083
Mount Pleasant 2769 3278 0 4070 0 3698 9679 2746 2654 18 1624 30536
Musqueam 17 86 0 104 0 1 217 7 40 1 59 532
Oakridge 332 2089 0 889 0 1176 2290 172 669 6 414 8037
Renfrew-Collingwood 1197 4296 0 3886 0 4119 8420 419 3011 12 1401 26761
Riley Park 848 2706 0 1795 0 410 4269 621 1197 4 671 12521
Shaughnessy 129 1774 0 633 0 25 1769 139 371 7 579 5426
South Cambie 314 1109 0 606 0 759 1529 221 435 2 237 5212
Stanley Park 72 65 0 246 0 13 2868 214 74 6 217 3775
Strathcona 2168 2019 0 4556 0 994 7343 1015 1650 20 1154 20919
Sunset 1105 2578 0 3243 0 1401 5226 255 2275 17 1296 17396
Victoria-Fraserview 386 2499 0 1761 0 483 3390 132 1372 10 786 10819
West End 2775 3480 0 5325 0 6033 16904 2985 2660 6 1184 41352
West Point Grey 331 1299 0 879 0 260 1971 372 450 4 305 5871
Total 33845 60862 220 70413 54142 52167 172700 25730 38418 254 21901 530652

Comments

  • Central Business District ranks first in all types of crime except Homicide (N/A), Offence Against a Person (N/A), and Break and Enter Residential/Other
    • Grandview-Woodland has the highest number in Break and Enter Residential/Other

5 Base EDA Step 3: Multi-variate non-graphical EDA

5.1 Crime rates by type over time

### TYPE ###

# Yearly incident counts, one line per crime type
crime %>%
  ggplot(mapping = aes(x = YEAR, colour = TYPE)) +
  geom_line(stat = "count") +
  labs(
    title = "Number by Crime Types by Year",
    x = "Year",
    y = "Number of Crimes"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )

# Monthly incident counts, one line per crime type
crime %>%
  ggplot(mapping = aes(x = MONTH, colour = TYPE)) +
  geom_line(stat = "count") +
  labs(
    title = "Numbers by Crime Types by Month",
    x = "Month",
    y = "Number of Crimes"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "red")
  )

# Day-of-month incident counts, one line per crime type
crime %>%
  ggplot(mapping = aes(x = DAY, colour = TYPE)) +
  geom_line(stat = "count") +
  labs(
    title = "Numbers by Crime Types by Day",
    x = "Day",
    y = "Number of Crimes"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "red")
  )

# Hour-of-day incident counts, one line per crime type
crime %>%
  ggplot(mapping = aes(x = HOUR, colour = TYPE)) +
  geom_line(stat = "count") +
  labs(
    title = "Numbers by Crime Types by Hour",
    x = "Hour",
    y = "Number of Crimes"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )
## Warning: Removed 54362 rows containing non-finite values (stat_count).

# Minute-of-hour incident counts, one line per crime type
crime %>%
  ggplot(mapping = aes(x = MINUTE, colour = TYPE)) +
  geom_line(stat = "count") +
  labs(
    title = "Numbers by Crime Types by Minute",
    x = "Minute",
    y = "Number of Crimes"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "red ")
  )
## Warning: Removed 54362 rows containing non-finite values (stat_count).

  • TYPE by YEAR
    • Other Theft continues to increase
    • Mischief steadily decreases, with some ups and downs over the years
    • Theft of Vehicle and Break and Enter Residential/Other drop sizably over the years
    • While Theft from Vehicle accounts for the majority of crime incidents, its number of cases declines sharply from 2004 to 2011, then rebounds in recent years

  • TYPE by MONTH
    • Rate of Theft of Bicycle rises and peak during summer/ drops and stays low during winter
  • TYPE by DAY
    • Except for Other Theft, the numbers for all crime types are high on the first day of the month
    • While most crime rates slowly decline toward the end of the month, there is a sudden crest on day 15
  • TYPE by HOUR
    • The majority follow the trend of a low rate during early morning (hours 1-5), an increase during daytime until hour 18, and a high rate through the night (hours 18-24)
    • Other Theft stays low during early morning and peaks the top at 15 hour
    • Break and Enter Commercial/Other happens more during early morning and time around 18 hour
  • TYPE by MINUTE
    • Most crime incidents happen at the quarter hours, while the majority of them happen at minutes 0 and 30
  • Graphs are too crowded as there are too many types of crimes. I will try to group them into 3-4 based on their patterns over time.

5.1.1 Year

  • Group by patterns over years

  • Group 1(increase-decrease-increase): ‘Break and Enter Commercial’, ‘Theft from Vehicle’, ‘Vehicle Collision or Pedestrian Struck (with Injury)’

  • Group 2(constantly decrease): ‘Break and Enter Residential/Other’, ‘Mischief’, ‘Theft of Vehicle’, ‘Offence Against a Person’

  • Group 3(constantly increase): ‘Other Theft’, ‘Theft of Bicycle’

  • Group 4 (number of cases too small to get a conclusive pattern): ‘Homicide’, ‘Vehicle Collision or Pedestrian Struck (with Fatality)’

# Yearly incident counts, one line per crime type (overview for the
# year-pattern grouping)
crime %>%
  ggplot(mapping = aes(x = YEAR, colour = TYPE)) +
  geom_line(stat = "count") +
  labs(
    title = "Number by Crime Types by Year",
    x = "Year",
    y = "Number of Crimes"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )

#-------------------------------------#
# One subset of `crime` per TYPE value; each is plotted as its own panel in
# the grid.arrange() calls that follow
comm  <- filter(crime, TYPE %in% "Break and Enter Commercial")
resit <- filter(crime, TYPE %in% "Break and Enter Residential/Other")
homi  <- filter(crime, TYPE %in% "Homicide")
mis   <- filter(crime, TYPE %in% "Mischief")
off   <- filter(crime, TYPE %in% "Offence Against a Person")
ot    <- filter(crime, TYPE %in% "Other Theft")
tfv   <- filter(crime, TYPE %in% "Theft from Vehicle")
tb    <- filter(crime, TYPE %in% "Theft of Bicycle")
tv    <- filter(crime, TYPE %in% "Theft of Vehicle")
vcf   <- filter(
  crime,
  TYPE %in% "Vehicle Collision or Pedestrian Struck (with Fatality)"
)
vci   <- filter(
  crime,
  TYPE %in% "Vehicle Collision or Pedestrian Struck (with Injury)"
)

# Small multiples: yearly incident counts, one panel per crime type.
# Each panel is described once (data subset, colour, title) and drawn by a
# single builder function instead of eleven copy-pasted ggplot() calls, so
# the line and point colours of a panel can no longer drift apart.
grid.arrange(
  grobs = Map(
    function(type_data, panel_colour, panel_title) {
      ggplot(type_data, aes(x = YEAR)) +
        geom_line(stat = "count", color = panel_colour) +
        geom_point(stat = "count", color = panel_colour) +
        theme_classic() +
        # axis titles are deliberately blanked to save space in the grid
        labs(title = panel_title, x = " ", y = " ")
    },
    list(comm, resit, homi, mis, off, ot, tfv, tb, tv, vcf, vci),
    c(
      "blue", "red", "green", "purple", "dark green", "pink",
      "dark green", "orange", "purple", "light blue", "purple"
    ),
    c(
      "Break and Enter Commercial",
      "Break and Enter Residential/Other",
      "Homicide",
      "Mischief",
      "Offence Against a Person",
      "Other Theft",
      "Theft from Vehicle",
      "Theft of Bicycle",
      "Theft of Vehicle",
      "Vehicle Collision or Pedestrian Struck(with Fatality)",
      "Vehicle Collision or Pedestrian Struck(with Injury)"
    )
  )
)

5.1.2 Month

  • Group by patterns over months

  • Group 1 ( random pattern = up&down all year round): ‘Break and Enter Commercial’, ‘Vehicle Collision or Pedestrian Struck (with Injury)’, ‘Break and Enter Residential/Other’

  • Group 2 ( increase toward summer & decrease toward winter): ‘Offence Against a Person’, ‘Theft from Vehicle’, ‘Theft of Bicycle’, ‘Theft of Vehicle’,

  • Group 3 (decrease toward end of year): “Other Theft”, ‘Mischief’

  • Group 4 (too small sample size, random pattern) : ‘Homicide’, ‘Vehicle Collision or Pedestrian Struck (with Fatality)’

  • Most types of crime decline in December

# Monthly incident counts, one line per crime type (overview for the
# month-pattern grouping)
ggplot(crime, aes(x = MONTH, color = TYPE)) +
  geom_line(stat = "count") +
  theme_classic() +
  # x maps MONTH, so the axis is labelled "Month" (it previously read "Year")
  labs(
    title = "Number by Crime Types by Month",
    x = "Month",
    y = "Number of Crimes"
  ) +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )

#-------------------------------------#

# Small multiples: monthly incident counts, one panel per crime type.
# Each panel is described once (data subset, colour, title) and drawn by a
# single builder function instead of eleven copy-pasted ggplot() calls, so
# the line and point colours of a panel can no longer drift apart.
grid.arrange(
  grobs = Map(
    function(type_data, panel_colour, panel_title) {
      ggplot(type_data, aes(x = MONTH)) +
        geom_line(stat = "count", color = panel_colour) +
        geom_point(stat = "count", color = panel_colour) +
        theme_classic() +
        # axis titles are deliberately blanked to save space in the grid
        labs(title = panel_title, x = " ", y = " ")
    },
    list(comm, resit, homi, mis, off, ot, tfv, tb, tv, vcf, vci),
    c(
      "blue", "red", "green", "purple", "dark green", "pink",
      "dark green", "orange", "purple", "light blue", "purple"
    ),
    c(
      "Break and Enter Commercial",
      "Break and Enter Residential/Other",
      "Homicide",
      "Mischief",
      "Offence Against a Person",
      "Other Theft",
      "Theft from Vehicle",
      "Theft of Bicycle",
      "Theft of Vehicle",
      "Vehicle Collision or Pedestrian Struck(with Fatality)",
      "Vehicle Collision or Pedestrian Struck(with Injury)"
    )
  )
)

5.1.3 Days

  • Group by patterns over days

  • Group 1 ( high on 1st & 15th - sign of organized crime): ‘Break and Enter Commercial’, ‘Mischief’, ‘Theft of Bicycle’

  • Group 2 ( Gradually decrease toward end of month): ‘Break and Enter Residential/Other’, ‘Other Theft’, ‘Theft from Vehicle’, ‘Theft of Vehicle’

  • Group 3 ( flat with minor up&down ): ‘Offence Against a Person’, ‘Vehicle Collision or Pedestrian Struck (with Injury)’

  • Group 4 ( too small number, no pattern): ‘Homicide’, ‘Vehicle Collision or Pedestrian Struck (with Fatality)’

# Day-of-month incident counts, one line per crime type (overview for the
# day-pattern grouping)
crime %>%
  ggplot(mapping = aes(x = DAY, colour = TYPE)) +
  geom_line(stat = "count") +
  labs(
    title = "Numbers by Crime Types by Day",
    x = "Day",
    y = "Number of Crimes"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "red")
  )

# Small multiples: day-of-month incident counts, one panel per crime type.
# Each panel is described once (data subset, colour, title) and drawn by a
# single builder function instead of eleven copy-pasted ggplot() calls, so
# the line and point colours of a panel can no longer drift apart.
grid.arrange(
  grobs = Map(
    function(type_data, panel_colour, panel_title) {
      ggplot(type_data, aes(x = DAY)) +
        geom_line(stat = "count", color = panel_colour) +
        geom_point(stat = "count", color = panel_colour) +
        theme_classic() +
        # axis titles are deliberately blanked to save space in the grid
        labs(title = panel_title, x = " ", y = " ")
    },
    list(comm, resit, homi, mis, off, ot, tfv, tb, tv, vcf, vci),
    c(
      "blue", "red", "green", "purple", "dark green", "pink",
      "dark green", "orange", "purple", "light blue", "purple"
    ),
    c(
      "Break and Enter Commercial",
      "Break and Enter Residential/Other",
      "Homicide",
      "Mischief",
      "Offence Against a Person",
      "Other Theft",
      "Theft from Vehicle",
      "Theft of Bicycle",
      "Theft of Vehicle",
      "Vehicle Collision or Pedestrian Struck(with Fatality)",
      "Vehicle Collision or Pedestrian Struck(with Injury)"
    )
  )
)

5.1.4 Hours

  • Group by patterns over hours

  • Group 1 (High early morning & evening): ‘Break and Enter Commercial’

  • Group 2 ( Low early morning, increase toward 18 hour & high until 24 hour): ‘Break and Enter Residential/Other’, ‘Mischief’, ‘Theft from Vehicle’, ‘Theft of Bicycle’, ‘Theft of Vehicle’

  • Group 3 ( Peak at 18 hour): ‘Other Theft’, ‘Vehicle Collision or Pedestrian Struck (with Injury)’

  • Group 4 (info N/A & too small number - no pattern): ‘Homicide’, ‘Vehicle Collision or Pedestrian Struck (with Fatality)’, ‘Offence Against a Person’

# Hour-of-day incident counts, one line per crime type (overview for the
# hour-pattern grouping)
crime %>%
  ggplot(mapping = aes(x = HOUR, colour = TYPE)) +
  geom_line(stat = "count") +
  labs(
    title = "Numbers by Crime Types by Hour",
    x = "Hour",
    y = "Number of Crimes"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )
## Warning: Removed 54362 rows containing non-finite values (stat_count).

# Small multiples: hour-of-day incident counts, one panel per crime type.
# Each panel is described once (data subset, colour, title) and drawn by a
# single builder function instead of eleven copy-pasted ggplot() calls, so
# the line and point colours of a panel can no longer drift apart.
# HOUR is NA for every Homicide and Offence Against a Person record (see the
# NA row of the TYPE-by-HOUR table), so those two panels come out empty and
# ggplot2 warns about the removed rows.
grid.arrange(
  grobs = Map(
    function(type_data, panel_colour, panel_title) {
      ggplot(type_data, aes(x = HOUR)) +
        geom_line(stat = "count", color = panel_colour) +
        geom_point(stat = "count", color = panel_colour) +
        theme_classic() +
        # axis titles are deliberately blanked to save space in the grid
        labs(title = panel_title, x = " ", y = " ")
    },
    list(comm, resit, homi, mis, off, ot, tfv, tb, tv, vcf, vci),
    c(
      "blue", "red", "green", "purple", "dark green", "pink",
      "dark green", "orange", "purple", "light blue", "purple"
    ),
    c(
      "Break and Enter Commercial",
      "Break and Enter Residential/Other",
      "Homicide",
      "Mischief",
      "Offence Against a Person",
      "Other Theft",
      "Theft from Vehicle",
      "Theft of Bicycle",
      "Theft of Vehicle",
      "Vehicle Collision or Pedestrian Struck(with Fatality)",
      "Vehicle Collision or Pedestrian Struck(with Injury)"
    )
  )
)
## Warning: Removed 220 rows containing non-finite values (stat_count).
## Warning: Removed 220 rows containing non-finite values (stat_count).
## Warning: Removed 54142 rows containing non-finite values (stat_count).

## Warning: Removed 54142 rows containing non-finite values (stat_count).

5.1.5 Minutes

  • Group by patterns over minutes

  • Group 1 (Most cases occurred at minutes 0 & 30 - organized crime): ‘Break and Enter Commercial’, ‘Break and Enter Residential/Other’, ‘Mischief’, ‘Theft from Vehicle’, ‘Theft of Bicycle’, ‘Theft of Vehicle’

  • Group 2( Up&down with peaks at 0 and 30 min - mix of disorganized/organized crime): ‘Other Theft’, ‘Vehicle Collision or Pedestrian Struck (with Injury)’

  • Group 3(No info or too small number of incidents): ‘Homicide’, ‘Vehicle Collision or Pedestrian Struck (with Fatality)’ , ‘Offence Against a Person’

  • Other Theft and Vehicle Collision or Pedestrian Struck(with Injury) - mix of organized and disorganized

  • Vehicle Collision or Pedestrian Struck(with Fatality) - randomly distributed, it means most cases are not planned - disorganized

  • All others - share a specific pattern of having four quarters with majority occurred at 0 and 30 minute - organized

# Minute-of-hour incident counts, one line per crime type (overview for the
# minute-pattern grouping)
crime %>%
  ggplot(mapping = aes(x = MINUTE, colour = TYPE)) +
  geom_line(stat = "count") +
  labs(
    title = "Numbers by Crime Types by Minute",
    x = "Minute",
    y = "Number of Crimes"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "red ")
  )
## Warning: Removed 54362 rows containing non-finite values (stat_count).

# Small multiples: minute-of-hour incident counts, one panel per crime type.
# Each panel is described once (data subset, colour, title) and drawn by a
# single builder function instead of eleven copy-pasted ggplot() calls, so
# the line and point colours of a panel can no longer drift apart.
# MINUTE is NA for every Homicide and Offence Against a Person record (see
# the NA row of the TYPE-by-MINUTE table), so those two panels come out
# empty and ggplot2 warns about the removed rows.
grid.arrange(
  grobs = Map(
    function(type_data, panel_colour, panel_title) {
      ggplot(type_data, aes(x = MINUTE)) +
        geom_line(stat = "count", color = panel_colour) +
        geom_point(stat = "count", color = panel_colour) +
        theme_classic() +
        # axis titles are deliberately blanked to save space in the grid
        labs(title = panel_title, x = " ", y = " ")
    },
    list(comm, resit, homi, mis, off, ot, tfv, tb, tv, vcf, vci),
    c(
      "blue", "red", "green", "purple", "dark green", "pink",
      "dark green", "orange", "purple", "light blue", "purple"
    ),
    c(
      "Break and Enter Commercial",
      "Break and Enter Residential/Other",
      "Homicide",
      "Mischief",
      "Offence Against a Person",
      "Other Theft",
      "Theft from Vehicle",
      "Theft of Bicycle",
      "Theft of Vehicle",
      "Vehicle Collision or Pedestrian Struck(with Fatality)",
      "Vehicle Collision or Pedestrian Struck(with Injury)"
    )
  )
)
## Warning: Removed 220 rows containing non-finite values (stat_count).
## Warning: Removed 220 rows containing non-finite values (stat_count).
## Warning: Removed 54142 rows containing non-finite values (stat_count).

## Warning: Removed 54142 rows containing non-finite values (stat_count).

  • It is difficult to group by type of crimes based on patterns over time. Instead, I group them into four as Break_Enter, Theft, Vehicle_crime, and Other

5.1.6 Regroup and create new type of crime variable for detailed EDA

# Create new type variable


#crime$CLASS <- ifelse(crime$TYPE == c('Break and Enter Commercial', 'Mischief', 'Theft from Vehicle'), 'Class1', ifelse(crime$TYPE == c('Theft of Bicycle','Theft of Vehicle', 'Other Theft', 'Vehicle Collision or Pedestrian Struck (with Injury)', 'Offence Against a Person', 'Break and Enter Residential/Other'), 'Class2', 'Class3'))


# Collapse the eleven TYPE values into four broader classes; any TYPE not
# listed explicitly (e.g. Homicide, Mischief, Offence Against a Person)
# falls through to "Other"
crime$CLASS <- case_when(
  crime$TYPE %in% c(
    "Break and Enter Commercial",
    "Break and Enter Residential/Other"
  ) ~ "B&E",
  crime$TYPE %in% c(
    "Theft from Vehicle",
    "Theft of Bicycle",
    "Theft of Vehicle",
    "Other Theft"
  ) ~ "Theft",
  crime$TYPE %in% c(
    "Vehicle Collision or Pedestrian Struck (with Injury)",
    "Vehicle Collision or Pedestrian Struck (with Fatality)"
  ) ~ "Vehicle Crime",
  TRUE ~ "Other"
)
# YEAR: incident counts per year, one line per crime CLASS
ggplot(crime, aes(x = YEAR, colour = CLASS)) +
  geom_line(stat = "count") +
  labs(
    title = "Numbers by Crime Class by Year",
    x = "Year",
    y = "Number of Crimes"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )

# MONTH: incident counts per month, one line per crime CLASS
ggplot(crime, aes(x = MONTH, colour = CLASS)) +
  geom_line(stat = "count") +
  labs(
    title = "Numbers by Crime Class by Month",
    x = "Month",
    y = "Number of Crimes"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )

# DAY: incident counts per day of month, one line per crime CLASS
ggplot(crime, aes(x = DAY, colour = CLASS)) +
  geom_line(stat = "count") +
  labs(
    title = "Numbers by Crime Class by Day",
    x = "Day",
    y = "Number of Crimes"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )

# HOUR: incident counts per hour, one line per crime CLASS
# (HOUR is not available for 'Offence Against a Person' and 'Homicide')
ggplot(crime, aes(x = HOUR, colour = CLASS)) +
  geom_line(stat = "count") +
  labs(
    title = "Numbers by Crime Class by Hour",
    x = "Hour",
    y = "Number of Crimes"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )
## Warning: Removed 54362 rows containing non-finite values (stat_count).

# MINUTE: incident counts per minute, one line per crime CLASS
# (MINUTE is not available for 'Offence Against a Person' and 'Homicide')
ggplot(crime, aes(x = MINUTE, colour = CLASS)) +
  geom_line(stat = "count") +
  labs(
    title = "Numbers by Crime Class by Minute",
    x = "Minute",
    y = "Number of Crimes"
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )
## Warning: Removed 54362 rows containing non-finite values (stat_count).

  • Year
    • The Theft crime rate decreases until 2011, then sharply increases again
    • Break and Enter crimes gradually decrease over the years
    • Vehicle collision crimes decrease, although slowly and gradually
    • The rate of other crimes does not change

  • Month
    • Theft types of crime increase during summer and decline in winter. As the most frequent type of crime, this seems to drive the trend of the total crime rate over the years
    • The rest of types of crime stay still with small ups and downs
  • Day
    • Other than Vehicle associated crimes, the rest shares the trend of high crime rate on 1st & 15th day of month.
    • Theft crimes decrease toward the end of the month
    • Note that homicide rate does not share the same pattern with other types of crimes and it is unpredictable
  • Hour
    • The grouping model of class does not represent the trend by type of crimes in hour level
    • The rate of theft drops significantly after hour 24 (midnight), increases from 5 am until hour 18, and stays high until hour 24
    • While break and enter types of crime peak at hour 18, the rate of Break and Enter Commercial is high during early morning (hours 0 - 5) and stays at a low level until hour 15
    • Vehicle-associated crimes increase during late afternoon
    • Other types of crimes share the same pattern with Theft
  • Minute
    • Other than Vehicle collision crime types, all have high number of crimes at 0 and 30 minute

5.2 Crime rates by Neighbourhood over time

### NEIGHBOURHOOD ###

# Titles name the grouping actually plotted (NEIGHBOURHOOD); they previously
# read "Crime Types", copied from the TYPE plots.

# Crime incidents by Neighbourhood by Year
ggplot(crime, aes(x = YEAR, color = NEIGHBOURHOOD)) +
  geom_line(stat = "count") +
  theme_classic() +
  labs(
    title = "Numbers of Crimes by Neighbourhood by Year",
    x = "Year",
    y = "Number of Crimes"
  ) +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )

# Crime incidents by Neighbourhood by Month
ggplot(crime, aes(x = MONTH, color = NEIGHBOURHOOD)) +
  geom_line(stat = "count") +
  theme_classic() +
  labs(
    title = "Numbers of Crimes by Neighbourhood by Month",
    x = "Month",
    y = "Number of Crimes"
  ) +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )

# Crime incidents by Neighbourhood by Day
ggplot(crime, aes(x = DAY, color = NEIGHBOURHOOD)) +
  geom_line(stat = "count") +
  theme_classic() +
  labs(
    title = "Numbers of Crimes by Neighbourhood by Day",
    x = "Day",
    y = "Number of Crimes"
  ) +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )

# Crime incidents by Neighbourhood by Hour
ggplot(crime, aes(x = HOUR, color = NEIGHBOURHOOD)) +
  geom_line(stat = "count") +
  theme_classic() +
  labs(
    title = "Numbers of Crimes by Neighbourhood by Hour",
    x = "Hour",
    y = "Number of Crimes"
  ) +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )
## Warning: Removed 54362 rows containing non-finite values (stat_count).

# Crime incidents by Neighbourhood by Minute
# The title names the grouping actually plotted (NEIGHBOURHOOD); it
# previously read "Crime Types", copied from the TYPE plots.
ggplot(crime, aes(x = MINUTE, color = NEIGHBOURHOOD)) +
  geom_line(stat = "count") +
  theme_classic() +
  labs(
    title = "Numbers of Crimes by Neighbourhood by Minute",
    x = "Minute",
    y = "Number of Crimes"
  ) +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )
## Warning: Removed 54362 rows containing non-finite values (stat_count).

- Split neighbourhoods into two groups

  • Group 1 (Top 11 neighbourhoods with the highest crime rate): Central Business District, West End, Fairview, Mount Pleasant, Grandview-Woodland, Renfrew-Collingwood, Kitsilano, Kensington-Cedar Cottage, Strathcona, Hastings-Sunrise, Sunset

  • Group 2(Others): Marpole, Riley Park, Victoria-Fraserview, Killarney, Oakridge, Dunbar-Southlands, Kerrisdale, Arbutus Ridge, West Point Grey, Shaughnessy, South Cambie, Stanley Park, Musqueam

# Create new neighbourhood variable
#crime <- mutate(crime, COMMUNITY = as.factor(
    #ifelse(NEIGHBOURHOOD == c('Central Business District', 'West End', 'Fairview', 'Mount Pleasant', 'Grandview-Woodland', 'Renfrew-Collingwood',  'Kitsilano', 'Kensington-Cedar Cottage', 'Strathcona', 'Hastings-Sunrise', 'Sunset'), "Unsafe_comm", 'safe_comm' )))
           #ifelse(NEIGHBOURHOOD == c('Marpole', 'Riley Park', 'Victoria-Fraserview', 'Killarney', 'Oakridge', 'Dunbar-Southlands', 'Kerrisdale', 'Arbutus Ridge', 'West Point Grey', 'Shaughnessy', 'South Cambie', 'Stanley Park', 'Musqueam'), 'safe_comm', "N/A"))))
  • TOP 11 neighbourhood with most crimes

5.2.1 Group as two for neighbourhoods (Safe Community, Unsafe Community)

# Number of crime records per neighbourhood, largest count first
z <- crime %>%
  count(NEIGHBOURHOOD, sort = TRUE)
# Open the ranking for inspection
view(z)
  • Split neighbourhoods into two groups

  • 56624 cases missing location information and they are labeled as “OFFSET TO PROTECT PRIVACY”

  • Group 1 (Top 8 neighbourhoods with the highest crime rate; number of crimes > 24000, with a large drop of about 4000 in crime numbers between Kensington-Cedar Cottage and Strathcona): Central Business District, West End, Fairview, Mount Pleasant, Grandview-Woodland, Renfrew-Collingwood, Kitsilano, Kensington-Cedar Cottage

  • Group 2(Others): Strathcona, Hastings-Sunrise, Sunset, Marpole, Riley Park, Victoria-Fraserview, Killarney, Oakridge, Dunbar-Southlands, Kerrisdale, Arbutus Ridge, West Point Grey, Shaughnessy, South Cambie, Stanley Park, Musqueam

# Create new neighbourhood variable
#crime <- mutate(crime, COMMUNITY = as.factor(
 #   ifelse(NEIGHBOURHOOD %in% c('Central Business District', 'West End', 'Fairview', 'Mount Pleasant', 'Grandview-Woodland', 'Renfrew-Collingwood',    'Kitsilano', 'Kensington-Cedar Cottage', 'Strathcona', 'Hastings-Sunrise', 'Sunset'), "Unsafe_comm", 'safe_comm',  ifelse(NEIGHBOURHOOD %in% c('Marpole', 'Riley Park', 'Victoria-Fraserview', 'Killarney', 'Oakridge', 'Dunbar-Southlands', 'Kerrisdale', 'Arbutus Ridge', 'West Point Grey', 'Shaughnessy', 'South Cambie', 'Stanley Park', 'Musqueam'), 'safe_comm', "N/A"))))
           

#crime$COMM <- ifelse(crime$NEIGHBOURHOOD %in% c('Central Business District', 'West End', 'Fairview', 'Mount Pleasant', 'Grandview-Woodland', 'Renfrew-Collingwood',    'Kitsilano', 'Kensington-Cedar Cottage', 'Strathcona', 'Hastings-Sunrise', 'Sunset'), "Unsafe_comm", ifelse(NEIGHBOURHOOD %in% c('Marpole', 'Riley Park', 'Victoria-Fraserview', 'Killarney', 'Oakridge', 'Dunbar-Southlands', 'Kerrisdale', 'Arbutus Ridge', 'West Point Grey', 'Shaughnessy', 'South Cambie', 'Stanley Park', 'Musqueam'), 'safe_comm', "N/A"))

# Community label for every record:
#   "Unsafe_comm" - one of the eight listed neighbourhoods
#   "N/A"         - NEIGHBOURHOOD is the empty string
#   "Safe_comm"   - everything else
# Start from the default label and overwrite the two special cases; the
# two conditions cannot both hold, so the order of the overwrites is free.
crime$COMM <- "Safe_comm"
crime$COMM[crime$NEIGHBOURHOOD %in% ""] <- "N/A"
crime$COMM[crime$NEIGHBOURHOOD %in% c(
  "Central Business District", "West End", "Fairview", "Mount Pleasant",
  "Grandview-Woodland", "Renfrew-Collingwood", "Kitsilano",
  "Kensington-Cedar Cottage"
)] <- "Unsafe_comm"

#'Strathcona', 'Hastings-Sunrise', 'Sunset'

5.3 Crime rate by Community over time

# Crime incidents by Community by Year
# Count of records per YEAR, one line per COMM value.
ggplot(crime, aes(x = YEAR, color = COMM)) +
  geom_line(stat = "count") +
  theme_classic() +
  labs(
    title = "Crime rates by Community by Year",
    x = "Year",
    y = "Number of Crimes"
  ) +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )

# Crime incidents by Community by Month
# Count of records per MONTH, one line per COMM value.
ggplot(crime, aes(x = MONTH, color = COMM)) +
  geom_line(stat = "count") +
  theme_classic() +
  labs(
    title = "Crime rates by Community by Month",
    x = "Month",
    y = "Number of Crimes"
  ) +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )

# Crime incidents by Community by Day
# Count of records per DAY, one line per COMM value.
ggplot(crime, aes(x = DAY, color = COMM)) +
  geom_line(stat = "count") +
  theme_classic() +
  labs(
    title = "Crime rates by Community by Day",
    x = "Day",
    y = "Number of Crimes"
  ) +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )

# Crime incidents by Community by Hour
# Count of records per HOUR, one line per COMM value.
ggplot(crime, aes(x = HOUR, color = COMM)) +
  geom_line(stat = "count") +
  theme_classic() +
  labs(
    title = "Crime rates by Community by Hour",
    x = "Hour",
    y = "Number of Crimes"
  ) +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )
## Warning: Removed 54362 rows containing non-finite values (stat_count).

# Crime incidents by Community by Minute
# Count of records per MINUTE, one line per COMM value.
ggplot(crime, aes(x = MINUTE, color = COMM)) +
  geom_line(stat = "count") +
  theme_classic() +
  labs(
    title = "Crime rates by Community by Minute",
    x = "Minute",
    y = "Number of Crimes"
  ) +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )
## Warning: Removed 54362 rows containing non-finite values (stat_count).

  • CLASS and COMM show a similar pattern over time. Does this mean the most frequent types of crime occur in unsafe neighbourhoods?

6 Detailed EDA

6.1 Percentage of Theft type crimes higher in unsafe communities?

# Share of each crime CLASS within Safe_comm and within Unsafe_comm.
# Records whose COMM is neither of the two labels (the "N/A" rows) are
# excluded first. Grouping by COMM makes ..prop.. a within-community
# proportion; one facet is drawn per community.
final_graph1 <- crime %>%
  filter(COMM %in% c("Safe_comm", "Unsafe_comm")) %>%
  ggplot(aes(x = CLASS, group = COMM)) +
  geom_bar(
    aes(y = ..prop.., fill = factor(..x..)),
    stat = "count"
  ) +
  # Percentage label drawn at the top of each bar
  geom_text(
    aes(label = scales::percent(..prop..), y = ..prop..),
    stat = "count",
    vjust = 0
  ) +
  facet_grid(~COMM) +
  scale_y_continuous(labels = percent) +
  labs(y = "Percent", fill = "CLASS") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(colour = "blue")
  )
#ggsave("Theft_unsafe_comm.png", plot = final_graph1)
  • In Unsafe communities, Theft-type crimes are more common, while the share of Break and Enter crimes in total crime incidents is 9% higher in Safe communities

6.1.1 Suggestions:

  • 1: Theft-type crime occurs more frequently in unsafe neighbourhoods. Since Theft crimes are more likely to happen impulsively, without planning ahead, improving general public security needs to be prioritized in those communities, for example by increasing the number of patrols
    • 2: Break and Enter crimes are relatively common in safe neighbourhoods. As this type of crime is generally organized and planned ahead, enhancing surveillance systems for places vulnerable to it will help to lower the crime rate among the safe neighbourhoods

6.2 Is Vancouver getting safer? If so, what is the leading cause for it?

#group_by(Company, flavor) %>% summarise(medPrice = median(price, na.rm=FALSE)) %>% 
 # ggplot(aes(x = flavor, y=medPrice, fill = Company)) + geom_histogram(position = "dodge", stat = "identity")

#crime %>% group_by(MONTH) %>% summarize(mean_MONTH = mean(MONTH, na.rm =FALSE)) %>% ggplot(x = YEAR, y = mean_MONTH) + geom_line(stat = 'count')
# Four yearly count charts arranged in one frame with a shared heading.
grid.arrange(
  # Panel 1: every record, counted per YEAR
  ggplot(crime, aes(x = YEAR)) +
    geom_line(stat = "count") +
    ylim(0, 50000) +
    theme_classic() +
    labs(title = "Total Annual Crime Rate", y = "Number of Crimes", x = "") +
    theme(
      plot.title = element_text(size = 10, colour = "blue"),
      axis.title = element_text(size = 10)
    ),

  # Panel 2: every record whose CLASS is not "Theft"
  ggplot(crime %>% filter(!CLASS %in% "Theft"), aes(x = YEAR)) +
    geom_line(stat = "count") +
    ylim(0, 50000) +
    theme_classic() +
    labs(
      title = "Annual Crime Rate Without Theft",
      y = "Number of Crimes",
      x = ""
    ) +
    theme(
      plot.title = element_text(size = 10, colour = "red"),
      axis.title = element_text(size = 10)
    ),

  # Panel 3: the "Theft" CLASS only, with a compact legend in the corner
  ggplot(
    crime %>% filter(CLASS %in% "Theft"),
    aes(x = YEAR, color = CLASS, fill = CLASS)
  ) +
    geom_line(stat = "count") +
    ylim(0, 50000) +
    theme_classic() +
    labs(title = "Theft Annual Crime Rate", y = "Number of Crimes", x = "") +
    theme(
      legend.position = c(.99, .99),
      legend.justification = c("right", "top"),
      legend.box.just = "right",
      legend.margin = margin(1, 1, 1, 1),
      legend.title = element_text(size = 5),
      legend.text = element_text(size = 5),
      legend.key.size = unit(0.05, "cm"),
      plot.title = element_text(size = 10, colour = "blue"),
      axis.title = element_text(size = 10)
    ),

  # Panel 4: one line per non-"Theft" CLASS (note the lower y limit)
  ggplot(
    crime %>% filter(!CLASS %in% "Theft"),
    aes(x = YEAR, color = CLASS, fill = CLASS)
  ) +
    geom_line(stat = "count") +
    ylim(0, 30000) +
    theme_classic() +
    labs(
      title = "Annual Crime Rate of Other Crime Classes",
      y = "Number of Crimes",
      x = ""
    ) +
    theme(
      legend.position = c(.99, .99),
      legend.justification = c("right", "top"),
      legend.box.just = "right",
      legend.margin = margin(1, 1, 1, 1),
      legend.title = element_text(size = 5),
      legend.text = element_text(size = 5),
      legend.key.size = unit(0.05, "cm"),
      plot.title = element_text(size = 10, colour = "red"),
      axis.title = element_text(size = 10)
    ),

  top = "Annual Crime Rate Changes Over Years: Theft vs. Others"
)

#ggsave("Theft_most_impactable.png", plot = final_graph2)
  • As the graphs above show, the annual crime number decreases until 2011 and increases after 2013. Theft crimes share the same trend as the total annual crime count over the years. On the contrary, Break and Enter crimes gradually decrease over the years, and the rates of the other classes of crimes do not change much. This tells us that reducing theft-type crimes is the priority, while it also indicates a decline in organized crime

7 Statistical EDA

7.1 Hypothesis 1: Theft crimes more frequent in unsafe neighbourhoods

  • Null Hypothesis: Rate of Theft crimes is the same in both unsafe and safe neighbourhoods
    • Alternative Hypothesis: Rate of Theft crimes is higher in unsafe neighbourhoods

### Graphical analysis for Hypothesis 1

# Create column Theft_Crime vs Non_Theft_Crime
crime$Tft <- ifelse(
  crime$TYPE %in% c(
    "Theft from Vehicle", "Theft of Bicycle", "Theft of Vehicle", "Other Theft"
  ),
  "Theft_Crime",
  "Non_Theft_Crime"
)

# Flag the eight listed neighbourhoods as dangerous. Every other value,
# including a blank NEIGHBOURHOOD, falls into "Undangerous_comm".
crime$danger <- ifelse(
  crime$NEIGHBOURHOOD %in% c(
    "Central Business District", "West End", "Fairview", "Mount Pleasant",
    "Grandview-Woodland", "Renfrew-Collingwood", "Kitsilano",
    "Kensington-Cedar Cottage"
  ),
  "Dangerous_comm",
  "Undangerous_comm"
)

# Drop the records whose COMM is "N/A" (no neighbourhood recorded)
crime_test <- crime %>% filter(!COMM %in% "N/A")

# Tabulate the filtered data so the counts agree with the plot and the
# proportion test, which both use crime_test. Tabulating `crime` instead
# counts the blank-neighbourhood records as Undangerous_comm.
table(crime_test$Tft, crime_test$danger)
##                  
##                   Dangerous_comm Undangerous_comm
##   Non_Theft_Crime         114396            72606
##   Theft_Crime             206181            80845
# Stacked bars scaled to 100% (position = "fill"): share of Theft_Crime vs
# Non_Theft_Crime records within each `danger` group of crime_test.
ggplot(crime_test, aes(x = danger, fill = Tft)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = percent) +
  theme_classic() +
  labs(
    title = "Proportion of Theft Crimes in Safe Community vs Unsafe Community",
    x = "",
    y = "Percentage"
  ) +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )

### Proportion test

  • The p-value of the proportion test is < 2.2e-16, so the null hypothesis is rejected. The test result indicates that Theft crimes are more frequent in the neighbourhoods with high crime rates.
# Two-sample test for equality of proportions on the Tft x danger
# contingency table of crime_test; correct = FALSE turns off the continuity
# correction.
# NOTE(review): the alternative stated in the text is one-sided ("higher in
# unsafe neighbourhoods") but no `alternative` is passed, so the test runs
# two-sided (see the output below) - confirm this is intended.
prop.test(table(crime_test$Tft, crime_test$danger), correct= FALSE)
## 
##  2-sample test for equality of proportions without continuity
##  correction
## 
## data:  table(crime_test$Tft, crime_test$danger)
## X-squared = 5877.3, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.1093533 -0.1038444
## sample estimates:
##    prop 1    prop 2 
## 0.6117368 0.7183356

7.2 Hypothesis 2: Crime rate of Vancouver is decreasing

  • Null Hypothesis: Rate of crime is not changing or is increasing over years
    • Alternative Hypothesis: Rate of crime is decreasing over years

7.2.1 Graphical analysis for Hypothesis 2

  • Drop YEAR 2017 as the data contains crime record only up to July for Year 2017
# Number of crime records per YEAR, with 2017 excluded
df_sum <- crime %>%
  filter(!YEAR %in% "2017") %>%
  group_by(YEAR) %>%
  tally()

# Yearly counts with a fitted linear trend (red) and its confidence band
ggplot(df_sum, aes(x = YEAR, y = n)) +
  geom_line() +
  geom_smooth(method = "lm", color = "red") +
  theme_classic() +
  # y label corrected from the misspelt "Numer of Crimes"
  labs(
    title = "Number of Crimes vs Year",
    subtitle = "Number of crime incidents goes down as year goes by",
    x = "Year",
    y = "Number of Crimes"
  ) +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )

7.2.2 t-test

  • The t-test rejects the null hypothesis with a p-value of 1.892e-10 (fairly close to 0). This indicates that the rate of crime in Vancouver is decreasing over years. However, would it be true in all types of crimes? Let’s see the Hypothesis 3.
# Welch two-sample t-test with the calendar years as sample x and the
# yearly crime counts as sample y.
# NOTE(review): as called, this compares mean(YEAR) with mean(n) (2009.50 vs
# 36628.21 in the output below); it does not measure whether n falls as YEAR
# rises. A slope test such as summary(lm(n ~ YEAR, data = df_sum)) would
# address the stated hypothesis - TODO confirm intent before relying on
# this p-value.
t.test(df_sum$YEAR, df_sum$n, conf.level = 0.95)
## 
##  Welch Two Sample t-test
## 
## data:  df_sum$YEAR and df_sum$n
## t = -17.601, df = 13, p-value = 1.892e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -38867.76 -30369.67
## sample estimates:
## mean of x mean of y 
##   2009.50  36628.21

7.3 Hypothesis 3: Theft crime is the most significant factor on annual crime rate behaviour

  • Null Hypothesis: The total crime rate change over years is not same as Theft crime rate changes
    • Alternative Hypothesis: The total crime rate change over years is same as Theft crime rate changes

7.3.1 Graphical analysis for Hypothesis 3 - Regression of Theft Class rate over years

# Yearly record counts for the "Theft" CLASS, 2017 excluded
df_sum_Theft <- crime %>%
  filter(CLASS %in% "Theft") %>%
  filter(!YEAR %in% "2017") %>%
  group_by(YEAR) %>%
  tally()

# Yearly record counts for every other CLASS combined, 2017 excluded
df_sum_not_Theft <- crime %>%
  filter(!CLASS %in% "Theft") %>%
  filter(!YEAR %in% "2017") %>%
  group_by(YEAR) %>%
  tally()


# Linear trend of the yearly Theft counts; only the fitted line and its
# band are drawn, on a y axis limited to 10000-30000.
ggplot(df_sum_Theft, aes(x = YEAR, y = n)) +
  geom_smooth(method = "lm") +
  ylim(10000, 30000) +
  theme_classic() +
  labs(
    title = "Number of Theft Crimes vs Year",
    subtitle = "Number of Theft crime incidents declines significantly as year passes",
    x = "Year",
    y = "Number of Theft Crimes"
  ) +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )

# Linear trend of the yearly counts of all non-Theft classes; only the
# fitted line and its band are drawn, on a y axis limited to 10000-30000.
ggplot(df_sum_not_Theft, aes(x = YEAR, y = n)) +
  geom_smooth(method = "lm") +
  ylim(10000, 30000) +
  theme_classic() +
  # Labels corrected: this chart plots df_sum_not_Theft, but the original
  # text was copied from the Theft chart and still said "Theft".
  labs(
    title = "Number of Non-Theft Crimes vs Year",
    subtitle = "Number of Non-Theft crime incidents goes down at moderate degree as year passes",
    x = "Year",
    y = "Number of Non-Theft Crimes"
  ) +
  theme(
    axis.title = element_text(colour = "dark green"),
    title = element_text(colour = "blue")
  )

7.3.2 t-test

7.3.2.1 Conduct two hypothesis tests: 1. Rate change of Theft CLASS crime vs Total crime rate change over years & 2. Rate change of Non Theft CLASS crime vs Total crime rate change over years

  • Although both tests reject the null hypothesis, the test on the Theft CLASS crime trend has a much lower p-value (5.537e-12). Hence, it is more statistically significant, and it supports that the total crime rate of Vancouver behaves much more like the rate of Theft CLASS crimes over the years
#Total_crime <- with(df_sum$YEAR, df_sum$n)
#Theft_crime <- with(df_sum$YEAR, df_sum$n)

# Paired t-test of the yearly total counts against the yearly Theft counts
# (rows are paired by position, i.e. by year). `paired = TRUE` replaces the
# reassignable shorthand `T`.
# NOTE(review): a paired t-test checks whether the mean yearly difference is
# zero; it does not by itself show that the two series follow the same
# trend - confirm this is the intended test.
t.test(df_sum$n, df_sum_Theft$n, paired = TRUE, conf.level = 0.95)
## 
##  Paired t-test
## 
## data:  df_sum$n and df_sum_Theft$n
## t = 23.283, df = 13, p-value = 5.537e-12
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  15186.03 18292.40
## sample estimates:
## mean of the differences 
##                16739.21
# Paired t-test of the yearly total counts against the yearly non-Theft
# counts (rows are paired by position, i.e. by year). `paired = TRUE`
# replaces the reassignable shorthand `T`.
# NOTE(review): a paired t-test checks whether the mean yearly difference is
# zero; it does not by itself compare trends - confirm this is the intended
# test.
t.test(df_sum$n, df_sum_not_Theft$n, paired = TRUE, conf.level = 0.95)
## 
##  Paired t-test
## 
## data:  df_sum$n and df_sum_not_Theft$n
## t = 15.187, df = 13, p-value = 1.184e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  17059.81 22718.19
## sample estimates:
## mean of the differences 
##                   19889

8 Mapping Visual of Vancouver Crime

  • Each section has 3 types of maps including heat map, and RgoogleMap

8.1 Theft from Vehicle type crimes Map of Year 2003, 2011, and 2016

#crime_n <- crime %>% filter(HUNDRED_BLOCK %in% "OFFSET TO PROTECT PRIVACY")
## Theft from Vehicle Map of Year 2003

# "Theft from Vehicle" records of 2003
crime_tv_2003 <- crime %>%
  filter(YEAR %in% "2003") %>%
  filter(TYPE %in% "Theft from Vehicle")

# Read in the data: longitude/latitude as numeric columns, plus a matrix copy
rawdata1 <- data.frame(as.numeric(crime_tv_2003$Longitude), as.numeric(crime_tv_2003$Latitude))
names(rawdata1) <- c("lon", "lat")
data1 <- as.matrix(rawdata1)

# Rotate the lon-lat coordinates using a rotation matrix
# Trial and error lead to pi/15.0 = 12 degrees
theta <- pi / 15.0
m <- matrix(c(cos(theta), sin(theta), -sin(theta), cos(theta)), nrow = 2)
data1 <- data1 %*% m

# Reproduce William's original map
#par(bg='black')
#plot(data, cex=0.1, col="white", pch=16)

# Create heatmap with kde2d and overplot.
# The density is evaluated once and shared: the original ran the identical
# kde2d() call twice to fill both `k` and `a`.
k <- kde2d(data1[, 1], data1[, 2], n = 500)
a <- k
# 10000 colours interpolated from the reversed 'BrBG' ColorBrewer palette
cols <- rev(colorRampPalette(brewer.pal(8, "BrBG"))(10000))
#par(bg='white')
#image(k, col=cols, xaxt='n', yaxt='n')
#points(data1, cex=0.1, pch=16)

# Mapping via RgoogleMaps
# Find map center (mean lat, mean lon) and get map
center <- rev(vapply(rawdata1, mean, numeric(1)))
map <- GetMap(center = center, zoom = 12)
## sleptTotal= 0
# Translate original data
# NOTE(review): zoom 11 is passed here while the map was fetched at
# zoom = 12 - confirm the mismatch is deliberate.
coords1 <- LatLon2XY.centered(map, rawdata1$lat, rawdata1$lon, 11)
coords1 <- data.frame(coords1)

# Rerun heatmap
#k2 <- kde2d(coords$newX, coords$newY, n=500)

# Create exponential transparency vector and add
alpha <- seq.int(0.5, 0.95, length.out = 100)
alpha <- exp(alpha^6 - 1)
cols2 <- AddAlpha(cols, alpha)

# Plot
PlotOnStaticMap(map)
## [1] "Caution: map type is OpenStreetMap. Until we find the correct projection algorithm, we treat lat/lon like planar coordinates and set TrueProj = FALSE."
#image(k2, col=cols2, add=T)
points(coords1$newX, coords1$newY, pch = 16, cex = 0.3)

#-----------------------------------------------------#

# "Theft from Vehicle" records of 2011
crime_tv_2011 <- crime %>%
  filter(YEAR %in% "2011") %>%
  filter(TYPE %in% "Theft from Vehicle")

# Read in the data: longitude/latitude as numeric columns, plus a matrix copy
rawdata2 <- data.frame(as.numeric(crime_tv_2011$Longitude), as.numeric(crime_tv_2011$Latitude))
names(rawdata2) <- c("lon", "lat")
data2 <- as.matrix(rawdata2)

# Rotate the lon-lat coordinates using a rotation matrix
# Trial and error lead to pi/15.0 = 12 degrees
theta <- pi / 15.0
m <- matrix(c(cos(theta), sin(theta), -sin(theta), cos(theta)), nrow = 2)
data2 <- data2 %*% m

# Reproduce William's original map
#par(bg='black')
#plot(data, cex=0.1, col="white", pch=16)

# Create heatmap with kde2d and overplot.
# The density is evaluated once and shared: the original ran the identical
# kde2d() call twice to fill both `k` and `b`.
k <- kde2d(data2[, 1], data2[, 2], n = 500)
b <- k
# 10000 colours interpolated from the reversed 'BrBG' ColorBrewer palette
cols <- rev(colorRampPalette(brewer.pal(8, "BrBG"))(10000))
#par(bg='white')
#image(k, col=cols, xaxt='n', yaxt='n')
#points(data2, cex=0.1, pch=16)

# Mapping via RgoogleMaps
# Find map center (mean lat, mean lon) and get map
center <- rev(vapply(rawdata2, mean, numeric(1)))
map <- GetMap(center = center, zoom = 12)
## sleptTotal= 0
# Translate original data
# NOTE(review): zoom 11 is passed here while the map was fetched at
# zoom = 12 - confirm the mismatch is deliberate.
coords2 <- LatLon2XY.centered(map, rawdata2$lat, rawdata2$lon, 11)
coords2 <- data.frame(coords2)

# Rerun heatmap
#k2 <- kde2d(coords$newX, coords$newY, n=500)

# Create exponential transparency vector and add
alpha <- seq.int(0.5, 0.95, length.out = 100)
alpha <- exp(alpha^6 - 1)
cols2 <- AddAlpha(cols, alpha)

# Plot
PlotOnStaticMap(map)
## [1] "Caution: map type is OpenStreetMap. Until we find the correct projection algorithm, we treat lat/lon like planar coordinates and set TrueProj = FALSE."
#image(k2, col=cols2, add=T)
points(coords2$newX, coords2$newY, pch = 16, cex = 0.3)

#------------------------------------------------#


# "Theft from Vehicle" records of 2016
crime_tv_2016 <- crime %>%
  filter(YEAR %in% "2016") %>%
  filter(TYPE %in% "Theft from Vehicle")

# Read in the data: longitude/latitude as numeric columns, plus a matrix copy
rawdata3 <- data.frame(as.numeric(crime_tv_2016$Longitude), as.numeric(crime_tv_2016$Latitude))
names(rawdata3) <- c("lon", "lat")
data3 <- as.matrix(rawdata3)

# Rotate the lon-lat coordinates using a rotation matrix
# Trial and error lead to pi/15.0 = 12 degrees
theta <- pi / 15.0
m <- matrix(c(cos(theta), sin(theta), -sin(theta), cos(theta)), nrow = 2)
data3 <- data3 %*% m

# Reproduce William's original map
#par(bg='black')
#plot(data, cex=0.1, col="white", pch=16)

# Create heatmap with kde2d and overplot.
# The density is evaluated once and shared: the original ran the identical
# kde2d() call twice to fill both `k` and `c`.
k <- kde2d(data3[, 1], data3[, 2], n = 500)

# NOTE: this variable is named `c`; calls such as c(...) still resolve to
# the base function because R looks up functions separately.
c <- k
# 10000 colours interpolated from the reversed 'BrBG' ColorBrewer palette
cols <- rev(colorRampPalette(brewer.pal(8, "BrBG"))(10000))
#par(bg='white')
#image(k, col=cols, xaxt='n', yaxt='n')
#points(data3, cex=0.1, pch=16)

# Mapping via RgoogleMaps
# Find map center (mean lat, mean lon) and get map
center <- rev(vapply(rawdata3, mean, numeric(1)))
map <- GetMap(center = center, zoom = 12)
## sleptTotal= 0
# Translate original data
# NOTE(review): zoom 11 is passed here while the map was fetched at
# zoom = 12 - confirm the mismatch is deliberate.
coords3 <- LatLon2XY.centered(map, rawdata3$lat, rawdata3$lon, 11)
coords3 <- data.frame(coords3)

# Rerun heatmap
#k2 <- kde2d(coords$newX, coords$newY, n=500)

# Create exponential transparency vector and add
alpha <- seq.int(0.5, 0.95, length.out = 100)
alpha <- exp(alpha^6 - 1)
cols2 <- AddAlpha(cols, alpha)

# Plot
PlotOnStaticMap(map)
## [1] "Caution: map type is OpenStreetMap. Until we find the correct projection algorithm, we treat lat/lon like planar coordinates and set TrueProj = FALSE."
#image(k2, col=cols2, add=T)
points(coords3$newX, coords3$newY, pch = 16, cex = 0.3)

8.2 Heat Map: Theft from Vehicle type crimes of Year 2003, 2011, and 2016

# Heat maps of the kernel density estimates (a = 2003, b = 2011, c = 2016)
# with the rotated incident points drawn on top.
# Base graphics calls are issued as separate statements: joining them with
# `+` (ggplot2 style) still drew both layers but then evaluated NULL + NULL
# and printed a stray `integer(0)` after every map.
image(a, col = cols, xaxt = "n", yaxt = "n")
points(data1, cex = 0.1, pch = 16)

image(b, col = cols, xaxt = "n", yaxt = "n")
points(data2, cex = 0.1, pch = 16)

image(c, col = cols, xaxt = "n", yaxt = "n")
points(data3, cex = 0.1, pch = 16)

  • Due to large volume of incidents, I present 3 maps of 2003, 2011, 2016 to manifest the crime rate for Theft from Vehicle over years

  • Theft from Vehicle type plays a major role in annual trend of crime and year 2003 is the first year of the survey, year 2011 is the year with the least crime incidents, and year 2016 is the last year of data with the full year data and most recent

  • The graphs show the overall crime rate trend through the years, so we can see how the crime rate has changed

  • As the maps show, most crime incidents occur downtown, downtown east side, and south region of downtown vicinity